In [2]:
# loading packages
import os, sys, glob
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import scanpy.external as sce
import scrublet as scr
import decoupler as dc
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=100, dpi_save=2000, figsize=(4, 4), color_map='OrRd', frameon=False, format='svg')
plt.rcParams['svg.fonttype'] = 'none'
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'
In [3]:
# reading
adata_list = []
for i in ['DMSO', 'CIS', 'PAC', 'TFT', 'DOX']:
adata = sc.read_10x_h5(f"/nfs/SCMGL/yjk11_bxb1/counts-GRCh38/HeLa-Bxb-{i}/outs/filtered_feature_bc_matrix.h5")
adata.obs.index.name = None
adata.var.index.name = None
adata.var_names_make_unique()
adata.obs['sample'] = i
adata.obs.index = adata.obs['sample'] + '_' + adata.obs.index
adata.var['mt'] = adata.var_names.str.startswith('MT-') # change
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, inplace=True)
sce.pp.scrublet(adata)
adata_list.append(adata)
adata = adata_list[0].concatenate(adata_list[1:], index_unique=None)
adata.var = adata.var[['gene_ids', 'mt']]
# quality control
sc.pp.filter_cells(adata, min_counts=2000)
sc.pp.filter_cells(adata, min_genes=500)
adata = adata[adata.obs['pct_counts_mt'] < 10, :]
adata = adata[adata.obs['doublet_score'] < 0.4, :]
# processing
adata.layers['counts'] = adata.X.copy()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata.raw = adata
sc.pp.highly_variable_genes(
adata,
n_top_genes=2000,
subset=False,
layer="counts",
flavor="seurat_v3",
)
sc.tl.pca(adata)
sc.pp.neighbors(adata, n_pcs=50, use_rep='X_pca')
sc.tl.umap(adata)
cell_cycle_genes = [x.strip()
for x in open('/home/yongjunkoh/references/regev_lab_cell_cycle_genes.txt')]
s_genes = cell_cycle_genes[:43]
g2m_genes = cell_cycle_genes[43:]
sc.tl.score_genes_cell_cycle(adata, s_genes, g2m_genes)
reading /nfs/SCMGL/yjk11_bxb1/counts-GRCh38/HeLa-Bxb-DMSO/outs/filtered_feature_bc_matrix.h5 (0:00:02)
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/anndata/_core/anndata.py:1830: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
Running Scrublet
filtered out 12485 genes that are detected in less than 3 cells
normalizing counts per cell
finished (0:00:00)
extracting highly variable genes
finished (0:00:01)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:170: UserWarning: Received a view of an AnnData. Making a copy. view_to_actual(adata)
normalizing counts per cell
finished (0:00:00)
normalizing counts per cell
finished (0:00:00)
Embedding transcriptomes using PCA...
Automatically set threshold at doublet score = 0.62
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.0%
Overall doublet rate:
Expected = 5.0%
Estimated = 0.0%
Scrublet finished (0:00:18)
reading /nfs/SCMGL/yjk11_bxb1/counts-GRCh38/HeLa-Bxb-CIS/outs/filtered_feature_bc_matrix.h5
(0:00:01)
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/anndata/_core/anndata.py:1830: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
Running Scrublet
filtered out 10909 genes that are detected in less than 3 cells
normalizing counts per cell
finished (0:00:00)
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:170: UserWarning: Received a view of an AnnData. Making a copy. view_to_actual(adata)
normalizing counts per cell
finished (0:00:00)
normalizing counts per cell
finished (0:00:00)
Embedding transcriptomes using PCA...
Automatically set threshold at doublet score = 0.61
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.3%
Overall doublet rate:
Expected = 5.0%
Estimated = 10.0%
Scrublet finished (0:00:17)
reading /nfs/SCMGL/yjk11_bxb1/counts-GRCh38/HeLa-Bxb-PAC/outs/filtered_feature_bc_matrix.h5
(0:00:01)
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/anndata/_core/anndata.py:1830: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
Running Scrublet
filtered out 12389 genes that are detected in less than 3 cells
normalizing counts per cell
finished (0:00:00)
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:170: UserWarning: Received a view of an AnnData. Making a copy. view_to_actual(adata)
normalizing counts per cell
finished (0:00:00)
normalizing counts per cell
finished (0:00:00)
Embedding transcriptomes using PCA...
Automatically set threshold at doublet score = 0.61
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.1%
Overall doublet rate:
Expected = 5.0%
Estimated = 22.2%
Scrublet finished (0:00:19)
reading /nfs/SCMGL/yjk11_bxb1/counts-GRCh38/HeLa-Bxb-TFT/outs/filtered_feature_bc_matrix.h5
(0:00:01)
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/anndata/_core/anndata.py:1830: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
Running Scrublet
filtered out 12245 genes that are detected in less than 3 cells
normalizing counts per cell
finished (0:00:00)
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:170: UserWarning: Received a view of an AnnData. Making a copy. view_to_actual(adata)
normalizing counts per cell
finished (0:00:00)
normalizing counts per cell
finished (0:00:00)
Embedding transcriptomes using PCA...
Automatically set threshold at doublet score = 0.57
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.4%
Overall doublet rate:
Expected = 5.0%
Estimated = 12.2%
Scrublet finished (0:00:14)
reading /nfs/SCMGL/yjk11_bxb1/counts-GRCh38/HeLa-Bxb-DOX/outs/filtered_feature_bc_matrix.h5
(0:00:01)
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/anndata/_core/anndata.py:1830: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
Running Scrublet
filtered out 11512 genes that are detected in less than 3 cells
normalizing counts per cell
finished (0:00:00)
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:170: UserWarning: Received a view of an AnnData. Making a copy. view_to_actual(adata)
normalizing counts per cell
finished (0:00:00)
normalizing counts per cell
finished (0:00:00)
Embedding transcriptomes using PCA...
Automatically set threshold at doublet score = 0.59
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.4%
Overall doublet rate:
Expected = 5.0%
Estimated = 8.0%
Scrublet finished (0:00:15)
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/anndata/_core/anndata.py:1785: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour. [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
filtered out 4175 cells that have less than 2000 counts
filtered out 24 cells that have less than 500 genes expressed
normalizing counts per cell
finished (0:00:00)
If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
--> added
'highly_variable', boolean vector (adata.var)
'highly_variable_rank', float vector (adata.var)
'means', float vector (adata.var)
'variances', float vector (adata.var)
'variances_norm', float vector (adata.var)
computing PCA
on highly variable genes
with n_comps=50
finished (0:00:08)
computing neighbors
2025-08-28 02:33:23.329965: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 2025-08-28 02:33:23.605620: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used. 2025-08-28 02:33:25.232545: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used. 2025-08-28 02:33:25.234229: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. 2025-08-28 02:33:28.793140: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:45)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:00:27)
calculating cell cycle phase
computing score 'S_score'
WARNING: genes are not in var_names and ignored: ['MLF1IP']
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/scanpy/tools/_score_genes.py:151: FutureWarning: Passing a set as an indexer is deprecated and will raise in a future version. Use a list instead. for cut in np.unique(obs_cut.loc[gene_list]):
finished: added
'S_score', score of gene set (adata.obs).
344 total control genes are used. (0:00:02)
computing score 'G2M_score'
WARNING: genes are not in var_names and ignored: ['FAM64A', 'HN1']
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/scanpy/tools/_score_genes.py:151: FutureWarning: Passing a set as an indexer is deprecated and will raise in a future version. Use a list instead. for cut in np.unique(obs_cut.loc[gene_list]):
finished: added
'G2M_score', score of gene set (adata.obs).
214 total control genes are used. (0:00:02)
--> 'phase', cell cycle phase (adata.obs)
In [4]:
sc.pl.umap(adata, color=['sample'])
In [12]:
# changing color and orders
adata.obs['sample'] = adata.obs['sample'].cat.reorder_categories(['DMSO', 'CIS', 'PAC', 'TFT', 'DOX'])
adata.uns['sample_colors'] = ['#e3e3dd', '#28b463', '#2f86c1', '#b03a2e', '#7c3b97']
sc.pl.umap(adata, color=['sample'])
In [11]:
# clustering
sc.tl.leiden(adata, resolution=0.5, key_added='leiden_0.5')
sc.pl.umap(adata, color=['leiden_0.5'])
running Leiden clustering
finished: found 9 clusters and added
'leiden_0.5', the cluster labels (adata.obs, categorical) (0:00:04)
In [9]:
# deg analysis
adata.uns['log1p']['base'] = None
sc.tl.rank_genes_groups(adata,
groupby='leiden_0.5',
method='wilcoxon',
key_added='leiden_0.5_wilcoxon')
with pd.ExcelWriter('hela-wilcoxon-markers_230626_yongjunkoh.xlsx') as writer:
for i in adata.obs['leiden_0.5'].cat.categories:
sc.get.rank_genes_groups_df(adata, group=i, key='leiden_0.5_wilcoxon').to_excel(writer, sheet_name=i)
In [10]:
# inspecting genes
sc.pl.umap(adata, color=['TYMS'])
sc.pl.umap(adata, color=['BCL2'])
sc.pl.umap(adata, color=['CDK6'])
In [13]:
# decoupler progeny (pathway analysis)
progeny = dc.get_progeny(organism='human', top=500)
dc.run_mlm(
mat=adata,
net=progeny,
source='source',
target='target',
weight='weight',
verbose=True
)
adata.obsm['progeny_mlm_estimate'] = adata.obsm['mlm_estimate'].copy()
adata.obsm['progeny_mlm_pvals'] = adata.obsm['mlm_pvals'].copy()
adata.obsm['progeny_mlm_estimate'] = adata.obsm['mlm_estimate'].copy()
adata.obsm['progeny_mlm_pvals'] = adata.obsm['mlm_pvals'].copy()
acts = dc.get_acts(adata, obsm_key='mlm_estimate')
sc.pl.matrixplot(acts, var_names=acts.var_names, groupby='sample', dendrogram=True, standard_scale='var', colorbar_title='Z-scaled scores', cmap='RdBu_r')
sc.pl.umap(acts, color=['p53'], cmap='RdBu_r', vcenter=0)
4835 features of mat are empty, they will be removed. Running mlm on mat with 32952 samples and 31766 targets for 14 sources.
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:56<00:00, 14.13s/it]
WARNING: dendrogram data not found (using key=dendrogram_sample). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.
using data matrix X directly
Storing dendrogram info using `.uns['dendrogram_sample']`
In [14]:
acts_p = acts.copy()
In [15]:
# decoupler collectri (tf analysis)
net = dc.get_collectri(organism='human', split_complexes=False)
dc.run_ulm(
mat=adata,
net=net,
source='source',
target='target',
weight='weight',
verbose=True
)
adata.obsm['collectri_ulm_estimate'] = adata.obsm['ulm_estimate'].copy()
adata.obsm['collectri_ulm_pvals'] = adata.obsm['ulm_pvals'].copy()
acts = dc.get_acts(adata, obsm_key='ulm_estimate')
df = dc.rank_sources_groups(acts, groupby='sample', reference='rest', method='t-test_overestim_var')
n_markers = 10
source_markers = df.groupby('group').head(n_markers).groupby('group')['names'].apply(lambda x: list(x)).to_dict()
sc.pl.matrixplot(acts, source_markers, 'sample', dendrogram=True, standard_scale='var', colorbar_title='Z-scaled scores', cmap='RdBu_r')
4835 features of mat are empty, they will be removed.
Running ulm on mat with 32952 samples and 31766 targets for 750 sources.
WARNING: dendrogram data not found (using key=dendrogram_sample). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.
using 'X_pca' with n_pcs = 50
Storing dendrogram info using `.uns['dendrogram_sample']`
In [18]:
# save
sc.settings.set_figure_params(dpi=100, dpi_save=2000, figsize=(4, 4), color_map='OrRd', frameon=False)
sc.pl.umap(adata, color=['sample'], save='_sample.svg')
sc.pl.umap(adata, color=['leiden_0.5'], save='_cluster.svg')
sc.pl.umap(adata, color=['TYMS'], save='_TYMS.svg')
sc.pl.umap(adata, color=['BCL2'], save='_BCL2.svg')
sc.pl.umap(adata, color=['CDK6'], save='_CDK6.svg')
sc.pl.umap(acts_p, color=['p53'], cmap='RdBu_r', vcenter=0, save='_p53.svg')
sc.pl.matrixplot(acts, source_markers, 'sample', dendrogram=True, standard_scale='var', colorbar_title='Z-scaled scores', cmap='RdBu_r', save='collectri.svg')
WARNING: saving figure to file figures/umap_sample.svg
WARNING: saving figure to file figures/umap_cluster.svg
WARNING: saving figure to file figures/umap_TYMS.svg
WARNING: saving figure to file figures/umap_BCL2.svg
WARNING: saving figure to file figures/umap_CDK6.svg
WARNING: saving figure to file figures/umap_p53.svg
WARNING: saving figure to file figures/matrixplot_collectri.svg
In [19]:
# save
sc.settings.set_figure_params(dpi=100, dpi_save=2000, figsize=(4, 4), color_map='OrRd', frameon=False)
sc.pl.umap(adata, color=['sample'], save='_sample.pdf')
sc.pl.umap(adata, color=['leiden_0.5'], save='_cluster.pdf')
sc.pl.umap(adata, color=['TYMS'], save='_TYMS.pdf')
sc.pl.umap(adata, color=['BCL2'], save='_BCL2.pdf')
sc.pl.umap(adata, color=['CDK6'], save='_CDK6.pdf')
sc.pl.umap(acts_p, color=['p53'], cmap='RdBu_r', vcenter=0, save='_p53.pdf')
sc.pl.matrixplot(acts, source_markers, 'sample', dendrogram=True, standard_scale='var', colorbar_title='Z-scaled scores', cmap='RdBu_r', save='collectri.pdf')
WARNING: saving figure to file figures/umap_sample.pdf
WARNING: saving figure to file figures/umap_cluster.pdf
WARNING: saving figure to file figures/umap_TYMS.pdf
WARNING: saving figure to file figures/umap_BCL2.pdf
WARNING: saving figure to file figures/umap_CDK6.pdf
WARNING: saving figure to file figures/umap_p53.pdf
WARNING: saving figure to file figures/matrixplot_collectri.pdf
In [ ]: